import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook
from bokeh.models import HoverTool, ColumnDataSource
#Read dataset into python
# Read the training dataset from Excel, treating common placeholder strings as NaN.
# Passing sheet_name as a string (not a one-element list) returns the DataFrame
# directly, avoiding the dict-of-frames round trip and the .get('Sheet1') lookup.
df = pd.read_excel(
    "C:/Users/lujai/OneDrive/Desktop/Programming for data analytics/Data_Train.xlsx",
    sheet_name="Sheet1",
    na_values=["na", "NaN", "--", " "],
)
df
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 24/03/2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 1/05/2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 9/06/2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 12/05/2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 01/03/2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10678 | Air Asia | 9/04/2019 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
| 10679 | Air India | 27/04/2019 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
| 10680 | Jet Airways | 27/04/2019 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
| 10681 | Vistara | 01/03/2019 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
| 10682 | Air India | 9/05/2019 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
10683 rows × 11 columns
Change the date format so it's easier to deal with and is in a representative format.
# Parse the day-first journey dates into proper datetime64 values.
df = df.assign(Date_of_Journey=pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y'))
df.head(10)
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 22 Mar | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 10 Jun | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
| 5 | SpiceJet | 2019-06-24 | Kolkata | Banglore | CCU → BLR | 09:00 | 11:25 | 2h 25m | non-stop | No info | 3873 |
| 6 | Jet Airways | 2019-03-12 | Banglore | New Delhi | BLR → BOM → DEL | 18:55 | 10:25 13 Mar | 15h 30m | 1 stop | In-flight meal not included | 11087 |
| 7 | Jet Airways | 2019-03-01 | Banglore | New Delhi | BLR → BOM → DEL | 08:00 | 05:05 02 Mar | 21h 5m | 1 stop | No info | 22270 |
| 8 | Jet Airways | 2019-03-12 | Banglore | New Delhi | BLR → BOM → DEL | 08:55 | 10:25 13 Mar | 25h 30m | 1 stop | In-flight meal not included | 11087 |
| 9 | Multiple carriers | 2019-05-27 | Delhi | Cochin | DEL → BOM → COK | 11:25 | 19:15 | 7h 50m | 1 stop | No info | 8625 |
In the Arrival_Time column, you'll see that some rows contain only the time while others contain both the time and a date, which is messy and needs to be tidied up. I'm going to remove the date from that column; since the values are strings, we can do that by splitting each one at the space.
# Arrival_Time sometimes carries a trailing date ("01:10 22 Mar"); keep only the
# clock time, i.e. everything before the first space.
df['Arrival_Time'] = df['Arrival_Time'].str.split(' ').str[0]
df
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2h 50m | non-stop | No info | 3897 |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2 stops | No info | 7662 |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 19h | 2 stops | No info | 13882 |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1 stop | No info | 6218 |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1 stop | No info | 13302 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10678 | Air Asia | 2019-04-09 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | non-stop | No info | 4107 |
| 10679 | Air India | 2019-04-27 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | non-stop | No info | 4145 |
| 10680 | Jet Airways | 2019-04-27 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | non-stop | No info | 7229 |
| 10681 | Vistara | 2019-03-01 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | non-stop | No info | 12648 |
| 10682 | Air India | 2019-05-09 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2 stops | No info | 11753 |
10683 rows × 11 columns
# Encode Total_Stops as a stop count. The result is float because any value
# missing from the mapping (or already NaN) becomes NaN.
stop_counts = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4}
df['Total_Stops'] = df['Total_Stops'].map(stop_counts)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10683 entries, 0 to 10682 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10683 non-null object 1 Date_of_Journey 10683 non-null datetime64[ns] 2 Source 10683 non-null object 3 Destination 10683 non-null object 4 Route 10682 non-null object 5 Dep_Time 10683 non-null object 6 Arrival_Time 10683 non-null object 7 Duration 10683 non-null object 8 Total_Stops 10682 non-null float64 9 Additional_Info 10683 non-null object 10 Price 10683 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(1), object(8) memory usage: 918.2+ KB
# Count missing values per column.
df.isnull().sum()
Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 1 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 1 Additional_Info 0 Price 0 dtype: int64
# Show the row(s) where Route is missing.
df.loc[df['Route'].isna()]
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9039 | Air India | 2019-05-06 | Delhi | Cochin | NaN | 09:45 | 09:25 | 23h 40m | NaN | No info | 7480 |
# Total_Stops also has a missing value; locate the offending row.
df.loc[df['Total_Stops'].isna()]
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9039 | Air India | 2019-05-06 | Delhi | Cochin | NaN | 09:45 | 09:25 | 23h 40m | NaN | No info | 7480 |
# Display everything up to and including row 9039: the row just above the NaN
# row shares the same Source, Destination and stop count, which is what
# justifies the forward fill performed below.
df.iloc[:9040]
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2h 50m | 0.0 | No info | 3897 |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2.0 | No info | 7662 |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 19h | 2.0 | No info | 13882 |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1.0 | No info | 6218 |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1.0 | No info | 13302 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9035 | Vistara | 2019-04-12 | Banglore | Delhi | BLR → DEL | 11:30 | 14:20 | 2h 50m | 0.0 | No info | 5403 |
| 9036 | Jet Airways | 2019-06-24 | Banglore | Delhi | BLR → DEL | 06:00 | 08:45 | 2h 45m | 0.0 | In-flight meal not included | 5769 |
| 9037 | Air Asia | 2019-03-24 | Kolkata | Banglore | CCU → BLR | 10:20 | 12:55 | 2h 35m | 0.0 | No info | 5620 |
| 9038 | IndiGo | 2019-06-18 | Delhi | Cochin | DEL → COK | 05:35 | 08:50 | 3h 15m | 0.0 | No info | 5000 |
| 9039 | Air India | 2019-05-06 | Delhi | Cochin | NaN | 09:45 | 09:25 | 23h 40m | NaN | No info | 7480 |
9040 rows × 11 columns
# Forward-fill the single missing Route and Total_Stops values: the preceding
# row has the same Source and Destination, so its values are a reasonable proxy.
# NOTE: `fillna(method='ffill')` is deprecated since pandas 2.1 (removed in 3.0);
# `.ffill()` is the supported equivalent.
df['Route'] = df['Route'].ffill()
df['Total_Stops'] = df['Total_Stops'].ffill()
df.head(5)
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2h 50m | 0.0 | No info | 3897 |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2.0 | No info | 7662 |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 19h | 2.0 | No info | 13882 |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1.0 | No info | 6218 |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1.0 | No info | 13302 |
# Double-check that the missing values are gone.
df.isnull().sum()
Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 0 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 0 Additional_Info 0 Price 0 dtype: int64
The Duration column needs tidying up, therefore I'm splitting it into hours and minutes, since some rows only have hours while others have both (hours and minutes). I'm also removing the 'h' and converting the columns to numeric.
The code below extracts what comes before the 'h' into the duration_hours column, and what comes after it into duration_minutes. The '?' in the regex allows for the cases where there are only hours and no minutes. The columns are then converted to numeric, and NaN values in the minutes column are filled with 0.
# Split "Xh Ym" durations into separate hour/minute columns. The minute part is
# optional in the raw data (e.g. "19h"), hence the optional group in the regex.
parts = df['Duration'].str.extract(r'(?P<duration_hours>\d+)h(?: (?P<duration_minutes>\d+)m)?')
df['duration_hours'] = pd.to_numeric(parts['duration_hours'])
df['duration_minutes'] = pd.to_numeric(parts['duration_minutes']).fillna(0)
# Check whether any rows failed to parse.
df.isnull().sum()
Airline 0 Date_of_Journey 0 Source 0 Destination 0 Route 0 Dep_Time 0 Arrival_Time 0 Duration 0 Total_Stops 0 Additional_Info 0 Price 0 duration_hours 1 duration_minutes 0 dtype: int64
# Identify the row whose Duration had no hour component (e.g. just "5m").
df.loc[df['duration_hours'].isnull()]
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | duration_hours | duration_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6474 | Air India | 2019-03-06 | Mumbai | Hyderabad | BOM → GOI → PNQ → HYD | 16:50 | 16:55 | 5m | 2.0 | No info | 17327 | NaN | 0.0 |
# Drop the one row whose duration could not be parsed, then rebuild a
# contiguous RangeIndex so positional access stays predictable.
df = df.drop(index=6474).reset_index(drop=True)
df
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Duration | Total_Stops | Additional_Info | Price | duration_hours | duration_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2h 50m | 0.0 | No info | 3897 | 2.0 | 50.0 |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7h 25m | 2.0 | No info | 7662 | 7.0 | 25.0 |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 19h | 2.0 | No info | 13882 | 19.0 | 0.0 |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5h 25m | 1.0 | No info | 6218 | 5.0 | 25.0 |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4h 45m | 1.0 | No info | 13302 | 4.0 | 45.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10677 | Air Asia | 2019-04-09 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 2h 30m | 0.0 | No info | 4107 | 2.0 | 30.0 |
| 10678 | Air India | 2019-04-27 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 2h 35m | 0.0 | No info | 4145 | 2.0 | 35.0 |
| 10679 | Jet Airways | 2019-04-27 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 3h | 0.0 | No info | 7229 | 3.0 | 0.0 |
| 10680 | Vistara | 2019-03-01 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 2h 40m | 0.0 | No info | 12648 | 2.0 | 40.0 |
| 10681 | Air India | 2019-05-09 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 8h 20m | 2.0 | No info | 11753 | 8.0 | 20.0 |
10682 rows × 13 columns
# The raw Duration string is now redundant; keep only the numeric columns.
df = df.drop(columns='Duration')
df
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Total_Stops | Additional_Info | Price | duration_hours | duration_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 0.0 | No info | 3897 | 2.0 | 50.0 |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 2.0 | No info | 7662 | 7.0 | 25.0 |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 2.0 | No info | 13882 | 19.0 | 0.0 |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 1.0 | No info | 6218 | 5.0 | 25.0 |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 1.0 | No info | 13302 | 4.0 | 45.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10677 | Air Asia | 2019-04-09 | Kolkata | Banglore | CCU → BLR | 19:55 | 22:25 | 0.0 | No info | 4107 | 2.0 | 30.0 |
| 10678 | Air India | 2019-04-27 | Kolkata | Banglore | CCU → BLR | 20:45 | 23:20 | 0.0 | No info | 4145 | 2.0 | 35.0 |
| 10679 | Jet Airways | 2019-04-27 | Banglore | Delhi | BLR → DEL | 08:20 | 11:20 | 0.0 | No info | 7229 | 3.0 | 0.0 |
| 10680 | Vistara | 2019-03-01 | Banglore | New Delhi | BLR → DEL | 11:30 | 14:10 | 0.0 | No info | 12648 | 2.0 | 40.0 |
| 10681 | Air India | 2019-05-09 | Delhi | Cochin | DEL → GOI → BOM → COK | 10:55 | 19:15 | 2.0 | No info | 11753 | 8.0 | 20.0 |
10682 rows × 12 columns
# Verify the new schema: Duration is gone, the numeric duration columns remain.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10682 entries, 0 to 10681 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10682 non-null object 1 Date_of_Journey 10682 non-null datetime64[ns] 2 Source 10682 non-null object 3 Destination 10682 non-null object 4 Route 10682 non-null object 5 Dep_Time 10682 non-null object 6 Arrival_Time 10682 non-null object 7 Total_Stops 10682 non-null float64 8 Additional_Info 10682 non-null object 9 Price 10682 non-null int64 10 duration_hours 10682 non-null float64 11 duration_minutes 10682 non-null float64 dtypes: datetime64[ns](1), float64(3), int64(1), object(7) memory usage: 1001.6+ KB
I'm creating a column called Duration_In_Minutes where I convert hours to minutes; this column is going to be used for statistical analysis.
# Total trip time in minutes — used later for the statistical analysis.
df['Duration_In_Minutes'] = df['duration_minutes'] + df['duration_hours'] * 60
# Rough cabin-class label inferred from the fare: above 12,000 rupees -> Business.
df['Class'] = np.where(df['Price'] > 12000, 'Business', 'Economy')
df.head(5)
| Airline | Date_of_Journey | Source | Destination | Route | Dep_Time | Arrival_Time | Total_Stops | Additional_Info | Price | duration_hours | duration_minutes | Duration_In_Minutes | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 0.0 | No info | 3897 | 2.0 | 50.0 | 170.0 | Economy |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 2.0 | No info | 7662 | 7.0 | 25.0 | 445.0 | Economy |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 2.0 | No info | 13882 | 19.0 | 0.0 | 1140.0 | Business |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 1.0 | No info | 6218 | 5.0 | 25.0 | 325.0 | Economy |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 1.0 | No info | 13302 | 4.0 | 45.0 | 285.0 | Business |
# Standardise column names (capitalised words) and record the price currency.
# An explicit rename mapping is safer than assigning a positional list of
# names, which silently mislabels columns if their order ever changes.
df = df.rename(columns={
    'Date_of_Journey': 'Date_of_journey',
    'Price': 'Price_rupee',
    'duration_hours': 'Duration_Hours',
    'duration_minutes': 'Duration_Minutes',
})
df.head()
| Airline | Date_of_journey | Source | Destination | Route | Dep_Time | Arrival_Time | Total_Stops | Additional_Info | Price_rupee | Duration_Hours | Duration_Minutes | Duration_In_Minutes | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 2019-03-24 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 0.0 | No info | 3897 | 2.0 | 50.0 | 170.0 | Economy |
| 1 | Air India | 2019-05-01 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 2.0 | No info | 7662 | 7.0 | 25.0 | 445.0 | Economy |
| 2 | Jet Airways | 2019-06-09 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 2.0 | No info | 13882 | 19.0 | 0.0 | 1140.0 | Business |
| 3 | IndiGo | 2019-05-12 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 1.0 | No info | 6218 | 5.0 | 25.0 | 325.0 | Economy |
| 4 | IndiGo | 2019-03-01 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 1.0 | No info | 13302 | 4.0 | 45.0 | 285.0 | Business |
# Re-inspect the dtypes after the rename.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10682 entries, 0 to 10681 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10682 non-null object 1 Date_of_journey 10682 non-null datetime64[ns] 2 Source 10682 non-null object 3 Destination 10682 non-null object 4 Route 10682 non-null object 5 Dep_Time 10682 non-null object 6 Arrival_Time 10682 non-null object 7 Total_Stops 10682 non-null float64 8 Additional_Info 10682 non-null object 9 Price_rupee 10682 non-null int64 10 Duration_Hours 10682 non-null float64 11 Duration_Minutes 10682 non-null float64 12 Duration_In_Minutes 10682 non-null float64 13 Class 10682 non-null object dtypes: datetime64[ns](1), float64(4), int64(1), object(8) memory usage: 1.1+ MB
# Render the datetime column as "YYYY-MM-DD HH:MM:SS" strings so the year and
# month can be pulled out with plain string splitting in the next cell.
df['Date_of_journey'] = df['Date_of_journey'].apply(lambda ts: ts.strftime("%Y-%m-%d %H:%M:%S"))
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10682 entries, 0 to 10681 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10682 non-null object 1 Date_of_journey 10682 non-null object 2 Source 10682 non-null object 3 Destination 10682 non-null object 4 Route 10682 non-null object 5 Dep_Time 10682 non-null object 6 Arrival_Time 10682 non-null object 7 Total_Stops 10682 non-null float64 8 Additional_Info 10682 non-null object 9 Price_rupee 10682 non-null int64 10 Duration_Hours 10682 non-null float64 11 Duration_Minutes 10682 non-null float64 12 Duration_In_Minutes 10682 non-null float64 13 Class 10682 non-null object dtypes: float64(4), int64(1), object(9) memory usage: 1.1+ MB
# Pull the year and month out of the "YYYY-MM-DD ..." strings as integers so
# they can be used for statistical analysis, then drop the original column.
date_parts = df['Date_of_journey'].str.split('-')
df["Journey_year"] = date_parts.str[0].astype(int)
df["Journey_month"] = date_parts.str[1].astype(int)
df = df.drop(columns="Date_of_journey")
# Prices are right-skewed, so keep a log-transformed copy for analysis.
df['Log_price'] = np.log(df['Price_rupee'])
df.head(5)
| Airline | Source | Destination | Route | Dep_Time | Arrival_Time | Total_Stops | Additional_Info | Price_rupee | Duration_Hours | Duration_Minutes | Duration_In_Minutes | Class | Journey_year | Journey_month | Log_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 0.0 | No info | 3897 | 2.0 | 50.0 | 170.0 | Economy | 2019 | 3 | 8.267962 |
| 1 | Air India | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 2.0 | No info | 7662 | 7.0 | 25.0 | 445.0 | Economy | 2019 | 5 | 8.944028 |
| 2 | Jet Airways | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 2.0 | No info | 13882 | 19.0 | 0.0 | 1140.0 | Business | 2019 | 6 | 9.538348 |
| 3 | IndiGo | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 1.0 | No info | 6218 | 5.0 | 25.0 | 325.0 | Economy | 2019 | 5 | 8.735204 |
| 4 | IndiGo | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 1.0 | No info | 13302 | 4.0 | 45.0 | 285.0 | Business | 2019 | 3 | 9.495670 |
# Inspect the current column order before rearranging it into a neater layout.
cols = df.columns.tolist()
cols
['Airline', 'Source', 'Destination', 'Route', 'Dep_Time', 'Arrival_Time', 'Total_Stops', 'Additional_Info', 'Price_rupee', 'Duration_Hours', 'Duration_Minutes', 'Duration_In_Minutes', 'Class', 'Journey_year', 'Journey_month', 'Log_price']
# Rearrange the columns into a more readable order.
ordered_columns = ['Airline', 'Journey_month', 'Journey_year', 'Source', 'Destination',
                   'Route', 'Dep_Time', 'Arrival_Time', 'Duration_Hours', 'Duration_Minutes',
                   'Duration_In_Minutes', 'Total_Stops', 'Price_rupee', 'Log_price',
                   'Class', 'Additional_Info']
df = df.loc[:, ordered_columns].copy()
df.head(5)
| Airline | Journey_month | Journey_year | Source | Destination | Route | Dep_Time | Arrival_Time | Duration_Hours | Duration_Minutes | Duration_In_Minutes | Total_Stops | Price_rupee | Log_price | Class | Additional_Info | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2.0 | 50.0 | 170.0 | 0.0 | 3897 | 8.267962 | Economy | No info |
| 1 | Air India | 5 | 2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7.0 | 25.0 | 445.0 | 2.0 | 7662 | 8.944028 | Economy | No info |
| 2 | Jet Airways | 6 | 2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 19.0 | 0.0 | 1140.0 | 2.0 | 13882 | 9.538348 | Business | No info |
| 3 | IndiGo | 5 | 2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5.0 | 25.0 | 325.0 | 1.0 | 6218 | 8.735204 | Economy | No info |
| 4 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4.0 | 45.0 | 285.0 | 1.0 | 13302 | 9.495670 | Business | No info |
# Persist the cleaned dataset to disk for reuse.
df.to_csv("C:/Users/lujai/OneDrive/Desktop/df.csv" ,index=False)
# CLASS PLOT — how many flights fall in each (price-derived) class.
# seaborn deprecates `palette` without `hue` (0.14); assign hue to the same
# column and suppress the redundant legend to keep the original look.
sns.countplot(x='Class', data=df, hue='Class', palette='dark', legend=False)
plt.title('Class Count')
Text(0.5, 1.0, 'Class Count')
As per the plot above, we can see that most of the data collected was from the economy class.
# Airline plot — count per airline on a log scale so rare carriers stay visible.
plt.figure(figsize=(8, 6))
# seaborn deprecates `palette` without `hue`; mirror x into hue.
sns.countplot(x='Airline', data=df, hue='Airline', palette='viridis', legend=False)
plt.xlabel('Airline')
plt.ylabel('Count (Log Scale)')
plt.yscale('log')  # log scale keeps low-count airlines visible
plt.title('Bar Plot of airline with Logarithmic Scale')
plt.xticks(rotation=45, ha='right')
plt.show()
A logarithmic count scale was used for this plot because a plain count axis isn't informative here — airlines with low counts would not be visible.
It's visible from the plot that 'Jet Airways' is the most commonly used airline in India, whereas 'Trujet' is the least commonly used.
The distribution is skewed to the right.
# Total stops plot — frequency of each stop count.
# Pass the column by name rather than mixing a Series into `x` while also
# supplying `data=` — clearer and unambiguous.
sns.countplot(x='Total_Stops', data=df)
plt.title("Count of Stops")
plt.show()
# Destination plot — frequency of each destination city.
plt.figure(figsize=(10, 6))
destination_counts = df["Destination"].value_counts()
destination_counts.plot(kind='bar', color='skyblue', width=0.5)
plt.title("Frequency of Destination City")
plt.xlabel("Destination City")
plt.ylabel("Count")
plt.show()
This bar plot shows that 'Cochin' is the most visited destination in India.
# SOURCE PLOT — frequency of each departure city.
plt.figure(figsize=(10, 6))
source_counts = df["Source"].value_counts()
source_counts.plot(kind='bar', color='pink', width=0.5)
plt.title("Frequency of source City")
plt.xlabel("Source City")
plt.ylabel("Count")
plt.show()
Delhi is the most common airport that the airlines depart from.
# DURATION MIN PLOT — box plot of total flight duration, plus its mean.
sns.boxplot(x='Duration_In_Minutes', data=df)
plt.title("Duration(min)")
plt.show()
# Mean duration, for comparison with the median shown in the box plot.
df['Duration_In_Minutes'].mean()
643.152967609062
Even though the mean is around 643, we can see here that the median is approximately 500. There are outliers in the data: some flights are much longer than the rest.
# Mean of the log-transformed price, for comparison with the median in the plot below.
df['Log_price'].mean()
8.989072368432629
# PRICE PLOT — side-by-side view of Log_price: box plot (left, subplot 1)
# and histogram with KDE overlay (right, subplot 2).
plt.figure(figsize=(18, 5))
plt.subplot(1, 2, 2)
sns.histplot(x='Log_price', data=df, kde=True, bins=10, color='purple')
plt.subplot(1, 2, 1)
plt.suptitle('Count of price(₹)', fontsize=16)
sns.boxplot(x='Log_price', data=df)
<Axes: xlabel='Log_price'>
Even though the mean is around 8.9, we can see that the median is approximately the same, just slightly higher. The distribution is roughly symmetrical, and there are outliers present in this data.
# ADDITIONAL INFO PLOT — frequency of each Additional_Info category.
plt.figure(figsize=(10, 6))
info_counts = df["Additional_Info"].value_counts()
info_counts.plot(kind='bar', color='pink', width=0.5)
plt.title("Frequency of Additional_Info")
plt.xlabel("Additional_Info")
plt.ylabel("Count")
plt.show()
The plot above isn't the best way to display 'Additional_Info', as it hides the minority categories, so a plot with a logarithmic count scale gives a more accurate representation — see the next cell for the plot.
# Additional_Info again, with a log count scale so minority categories show up.
plt.figure(figsize=(8, 6))
# seaborn deprecates `palette` without `hue`; mirror x into hue.
sns.countplot(x='Additional_Info', data=df, hue='Additional_Info', palette='viridis', legend=False)
plt.xlabel('Additional_Info')
plt.ylabel('Count (Log Scale)')
plt.yscale('log')
plt.title('Bar Plot of additional info with Logarithmic Scale')
plt.xticks(rotation=45, ha='right')
plt.show()
A log scale is used since only 'No info' has a high count and the other categories aren't visible otherwise; this shows that the majority of the passengers didn't get a meal or extra luggage, etc.
# Price vs total stops — box plot of log price per stop count.
plt.figure(figsize=(12, 5))
# seaborn deprecates `palette` without `hue`; mirror x into hue.
sns.boxplot(x='Total_Stops', y='Log_price', data=df, hue='Total_Stops', palette='pastel', legend=False)
plt.title('Plot of no. of total stops with Logarithmic Scale of price(₹)')
plt.show()
A lot of outliers are present for stop number 0. The IQRs overlap for stop numbers 2 and 3, with similar medians. We can conclude there's a relationship between the variables.
# Price vs airline — mean log price per airline.
ax = sns.barplot(x='Airline', y='Log_price', data=df)
# Rotate the tick labels via setp: calling set_xticklabels on automatic ticks
# triggers a matplotlib fixed-formatter warning.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel('Airline')
# The original passed this text to set_xlabel (with a stray `y=` kwarg);
# it is a title, so set it as one.
ax.set_title('Plot of airline vs price(₹)')
plt.show()
It's visible from the plot that 'Jet Airways Business' charges the highest for its flights.
There is no clear relationship or pattern between airline alone and price, as all the airlines seem to charge roughly the same.
# Price vs destination — distribution of log price per destination city.
sns.boxplot(data=df, x='Destination', y='Log_price')
plt.title('Plot of relationship between destination and price (₹)')
Text(0.5, 1.0, 'Plot of relationship between destination and price (₹)')
New Delhi, Banglore and Cochin seem to have similar flight prices — their medians are similar and their IQRs overlap — whereas flights to Kolkata, Delhi and Hyderabad are priced lower. The minimum value for all destinations (except Hyderabad) is approximately the same.
# Price vs source — distribution of log price per departure city.
sns.boxplot(data=df, x='Source', y='Log_price')
plt.title('Plot of relationship between source and price(₹)')
Text(0.5, 1.0, 'Plot of relationship between source and price(₹)')
Flights leaving Banglore, Kolkata and Delhi seem to charge more than those leaving Chennai and Mumbai. The minimum flight prices are approximately the same, except for Mumbai, which looks cheaper.
# Price by airline and class — grouped bars of mean log price per airline,
# split by the derived Class label.
plt.figure(figsize=(20, 10))
by_price = df.sort_values("Log_price")
sns.barplot(x='Airline', y='Log_price', hue="Class", data=by_price)
plt.xticks(rotation=45)
plt.title('Airline prices based on the class and company', fontsize=20)
plt.show()
Business tickets are available from all companies except Trujet, Vistara and 'Multiple carriers'. Also, there doesn't seem to be a huge gap between the prices of the two classes — the ratio is similar across airlines. Jet Airways Business has the highest prices and doesn't offer economy tickets. This suggests there's a relationship between class and price.
# Additional info vs price — mean log price per extra-request category.
sns.barplot(data=df, x='Additional_Info', y='Log_price')
plt.xticks(rotation=60)
plt.title('Airline price based on extra request ', fontsize=20)
plt.show()
This plot shows that, of all the extra requests available, business class is the most expensive option. There's no clear pattern, which suggests there is no relationship.
# Price vs class — distribution of log price for each cabin class.
sns.boxplot(data=df, x='Class', y='Log_price')
plt.title('Flight price based on class')
Text(0.5, 1.0, 'Flight price based on class')
Business class is more expensive than economy class — their IQRs don't overlap. There are a lot of outliers in business class. There's a relationship between the two variables.
# Average log price per duration value, plotted interactively with Bokeh.
df_grouped = df.groupby('Duration_In_Minutes')['Log_price'].mean().reset_index()
source = ColumnDataSource(df_grouped)
output_notebook()
# Bokeh 3 removed plot_height/plot_width in favour of height/width.
# Also fixes the "prizes" typo in the title.
p = figure(title="Average prices depending on the duration", height=350, width=800)
p.scatter(x="Duration_In_Minutes", y="Log_price", source=source, size=10, color="navy", alpha=0.5)
p.xaxis.axis_label = "Duration_In_Minutes"
p.yaxis.axis_label = "Price(₹)"
p.title.text_font_size = "15pt"
show(p)
This plot uses a groupby mean to get a more accurate representation. As the duration increases, the price increases, which suggests a relationship.
# Price vs duration, coloured by class.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Duration_In_Minutes', y='Log_price', hue='Class', color='blue')
plt.xlabel('Duration (minutes)')
plt.ylabel('Price')
plt.title('Scatter Plot of Price(₹) vs Duration')
plt.show()
This plot shows that even for flights of the same length, business class is more expensive. There are clear clusters in this plot (no overlap). As the duration increases, the price increases. However, there is no relationship between class and flight duration.
For the code below: create a Bokeh figure with the width, height, title and tools we want visible, then create a source. For the palette I chose Category10_5 because Total_Stops has 5 distinct values. I then plotted the scatter (size 8), colouring points by their Total_Stops value, added a legend, and added a hover tool so a particular point can be inspected if necessary.
# Price vs duration vs total stops — interactive Bokeh scatter, coloured by stop count.
from bokeh.palettes import Category10_5
from bokeh.transform import factor_cmap
# factor_cmap requires string factors, so render the stop counts as strings.
df['Total_Stops'] = df['Total_Stops'].astype(str)
# Bokeh 3 removed plot_width/plot_height in favour of width/height.
p = figure(width=800, height=500, title='Scatter Plot of Price vs Duration', tools="pan,box_zoom,reset,save")
source = ColumnDataSource(df)
output_notebook()
color_mapper = factor_cmap('Total_Stops', palette=Category10_5, factors=df['Total_Stops'].unique())
p.scatter(x='Duration_In_Minutes', y='Log_price', size=8, color=color_mapper, legend_field='Total_Stops', source=source)
p.xaxis.axis_label = 'Duration (minutes)'
p.yaxis.axis_label = 'Price'
p.legend.title = 'Total Stops'
# Hover tool exposes the exact duration/price of any point.
hover = HoverTool()
hover.tooltips = [('Duration', '@Duration_In_Minutes minutes'), ('Price', '₹@{Log_price}')]
p.add_tools(hover)
show(p)
This Bokeh plot shows that the majority of longer flights have fewer stops; however, the price range seems to be the same, with the exception of a few outliers. There seems to be a slight relationship.
# Boxplot: log price distribution for each journey month
ax = sns.boxplot(data=df, x='Journey_month', y='Log_price')
ax.set_title('Flight price tickets based on journey')
Text(0.5, 1.0, 'Flight price tickets based on journey month')
There is a strong relationship between month and price, as visible in the plot. In 2019, for March, May and June the IQRs overlap and the medians are similar; this is not the case for April. There are many outliers in March.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10682 entries, 0 to 10681 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10682 non-null object 1 Journey_month 10682 non-null int32 2 Journey_year 10682 non-null int32 3 Source 10682 non-null object 4 Destination 10682 non-null object 5 Route 10682 non-null object 6 Dep_Time 10682 non-null object 7 Arrival_Time 10682 non-null object 8 Duration_Hours 10682 non-null float64 9 Duration_Minutes 10682 non-null float64 10 Duration_In_Minutes 10682 non-null float64 11 Total_Stops 10682 non-null object 12 Price_rupee 10682 non-null int64 13 Log_price 10682 non-null float64 14 Class 10682 non-null object 15 Additional_Info 10682 non-null object dtypes: float64(4), int32(2), int64(1), object(9) memory usage: 1.2+ MB
Data visulisation - plot are shown above.
df['Duration_In_Minutes'].describe()
count 10682.000000 mean 643.152968 std 507.848241 min 75.000000 25% 170.000000 50% 520.000000 75% 930.000000 max 2860.000000 Name: Duration_In_Minutes, dtype: float64
The mean of Duration_In_Minutes is 643 and the median is 520, which shows the distribution could be roughly symmetrical; the difference between them is due to the outliers shown above.
df['Log_price'].describe()
count 10682.000000 mean 8.989072 std 0.513579 min 7.472501 25% 8.571113 50% 9.032648 75% 9.423272 max 11.283663 Name: Log_price, dtype: float64
The average log price of a flight is 8.99 (this is on the log scale, not in rupees).
df['Total_Stops'].describe()
count 10682.000000 mean 0.824003 std 0.675180 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 4.000000 Name: Total_Stops, dtype: float64
average number of stops is 0.82 .
df['Airline'].describe()
count 10682 unique 12 top Jet Airways freq 3849 Name: Airline, dtype: object
There are 12 unique airlines used in the dataset . most common airline is jet airways .
df['Destination'].describe()
count 10682 unique 6 top Cochin freq 4537 Name: Destination, dtype: object
There are 6 unique destinations used in this dataset cochin is the most popular destination
df['Source'].describe()
count 10682 unique 5 top Delhi freq 4537 Name: Source, dtype: object
There are 5 unique sources in this dataset. Delhi is the most common city that the airlines depart from
# Cast columns that should not enter numeric-only analyses (corr/pairplot)
# to strings, so only the genuinely quantitative columns remain numeric.
df['Duration_Hours']=df['Duration_Hours'].astype('str')
# changes the type of the duration hours to string
df['Duration_Minutes']=df['Duration_Minutes'].astype('str')
# changes the type of the duration minutes to string
df['Price_rupee']=df['Price_rupee'].astype('str')
df['Journey_year']=df['Journey_year'].astype('str')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10682 entries, 0 to 10681 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airline 10682 non-null object 1 Journey_month 10682 non-null int32 2 Journey_year 10682 non-null object 3 Source 10682 non-null object 4 Destination 10682 non-null object 5 Route 10682 non-null object 6 Dep_Time 10682 non-null object 7 Arrival_Time 10682 non-null object 8 Duration_Hours 10682 non-null object 9 Duration_Minutes 10682 non-null object 10 Duration_In_Minutes 10682 non-null float64 11 Total_Stops 10682 non-null float64 12 Price_rupee 10682 non-null object 13 Log_price 10682 non-null float64 14 Class 10682 non-null object 15 Additional_Info 10682 non-null object dtypes: float64(3), int32(1), object(12) memory usage: 1.3+ MB
# Pairwise scatter matrix of the remaining numeric columns
sns.pairplot(df)
plt.show()
plt.figure(figsize=(15, 15))
# Fix: pass numeric_only=True so df.corr() only uses numeric columns and the
# pandas FutureWarning about the changing default (seen in the cell output)
# is silenced; the resulting matrix is unchanged.
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="RdYlGn")
plt.show()
C:\Users\lujai\AppData\Local\Temp\ipykernel_10760\924879886.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df.corr(), annot = True, cmap = "RdYlGn")
The correlation between Log_price and Journey_month is -0.062, which suggests either a weak negative correlation or no correlation at all. The correlation between Log_price and Duration_In_Minutes is 0.6, a moderate positive correlation. The correlation between Log_price and Total_Stops is 0.69 — stronger than price vs duration, but still not strong. The correlation between Journey_month and Total_Stops is 0.055 (weak positive), and between Journey_month and Duration_In_Minutes is 0.015 (also weak positive). The correlation between Total_Stops and Duration_In_Minutes is 0.74, which is fairly strong.
# Two-sample independent t-test on log price, split by travel class.
# H0: mean_economy_price = mean_business_price (i.e. no relationship)
# H1: mean_economy_price != mean_business_price (i.e. relationship)
# NOTE(review): ttest_ind assumes equal variances by default; if the two
# groups' variances differ, equal_var=False (Welch's test) would be safer — confirm.
from scipy.stats import ttest_ind
economy_airline=df[df["Class"] == 'Economy']['Log_price']
business_airline=df[df["Class"] == 'Business']['Log_price']
res=ttest_ind(economy_airline,business_airline)
res
Ttest_indResult(statistic=-104.23892458930132, pvalue=0.0)
# Check whether the two class groups meet the normality assumption of the t-test.
from scipy import stats  # fix: stats.probplot is used below but scipy.stats was never imported

group1 = economy_airline
group2 = business_airline
plt.figure(figsize=(12, 6))
# Left panel: overlaid histograms of the two groups
plt.subplot(1, 2, 1)
sns.histplot(group1, kde=True, color='blue', label='economy_airline')
sns.histplot(group2, kde=True, color='orange', label='business_airline')
plt.title('Histograms for Normality Check')
plt.legend()
# Right panel: Q-Q plot of the economy-class prices (group1)
plt.subplot(1, 2, 2)
stats.probplot(group1, plot=plt, rvalue=True)
# Fix: the old title said "Indigo Price" but this cell plots economy-class prices
plt.title('Q-Q Plot for Economy Price')
plt.show()
# (removed dead commented-out prints referring to an undefined result_bartlett)
The distribution is symmetrical, and the Q-Q plot is roughly normal — the presence of outliers makes it look skewed — so we can interpret the p-value of the t-test. The p-value is less than 0.05, therefore it is significant: we reject the null hypothesis and conclude there is a relationship between price and class.
# Simple linear regression: does flight duration predict log price?
# H0: Duration of the flight has no effect on price (i.e. no relationship)
# H1: Duration of the flight has an effect on flight prices (i.e. relationship)
from statsmodels.formula.api import ols
model= ols('Log_price~ Duration_In_Minutes', data=df).fit()
model.summary()
| Dep. Variable: | Log_price | R-squared: | 0.355 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.355 |
| Method: | Least Squares | F-statistic: | 5875. |
| Date: | Sat, 09 Dec 2023 | Prob (F-statistic): | 0.00 |
| Time: | 00:37:55 | Log-Likelihood: | -5697.5 |
| No. Observations: | 10682 | AIC: | 1.140e+04 |
| Df Residuals: | 10680 | BIC: | 1.141e+04 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 8.6016 | 0.006 | 1335.493 | 0.000 | 8.589 | 8.614 |
| Duration_In_Minutes | 0.0006 | 7.86e-06 | 76.651 | 0.000 | 0.001 | 0.001 |
| Omnibus: | 314.486 | Durbin-Watson: | 1.986 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 395.499 |
| Skew: | 0.355 | Prob(JB): | 1.31e-86 |
| Kurtosis: | 3.621 | Cond. No. | 1.32e+03 |
# We have to check the assumptions before we use ANOVA
# 1) check for constant variance: residuals vs fitted values
#fitted values
model_fitted_vals = model.fittedvalues
#model residuals
model_residuals = model.resid
#standardised residuals (used for the Q-Q plot in the next cell)
model_norm_residuals = model.get_influence().resid_studentized_internal
sns.regplot(x=model_fitted_vals,y=model_residuals,
            ci=False,lowess=True,
            line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
# meets the assumption: the variance looks the same throughout the plot,
# though note the presence of outliers
Text(0, 0.5, 'Residuals')
# Q-Q plot of the standardised residuals: checks the normality assumption.
# NOTE(review): relies on `stats` being scipy.stats; it is not imported in the
# file header shown — confirm it was imported in an earlier cell.
stats.probplot(model_norm_residuals, plot=sns.mpl.pyplot)
plt.show()
# the plot meets the anova assumptions of normality so ANOVA can be used.
from statsmodels.stats.anova import anova_lm
anovaResults = anova_lm(model, typ=1)
anovaResults
| df | sum_sq | mean_sq | F | PR(>F) | |
|---|---|---|---|---|---|
| Duration_In_Minutes | 1.0 | 999.820727 | 999.820727 | 5875.343309 | 0.0 |
| Residual | 10680.0 | 1817.440242 | 0.170172 | NaN | NaN |
# Extract the p-value of the F test for the Duration_In_Minutes row.
# Fix: use .iloc[0] for positional access — anovaResults['PR(>F)'][0] relies
# on the deprecated integer fallback on a string-labelled index.
p_value = anovaResults['PR(>F)'].iloc[0]
print(f"P-value for F-statistic: {p_value}")
P-value for F-statistic: 0.0
From the anova , pvalue is less than 0.05 therefore its significant and we can conclude duration of flight has a relationship with price. Looking at the model summary , 35% of the variation of the model is explained by duration which is pretty low and therefore can suggest maybe theres interaction between duration and other variable that would be a better variable.
# test for interaction between duration and number of stops:
# '*' expands to Duration + Total_Stops + Duration:Total_Stops
modelinteraction_1= ols('Log_price ~Duration_In_Minutes*Total_Stops', data=df).fit()
modelinteraction_1.summary()
| Dep. Variable: | Log_price | R-squared: | 0.557 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.557 |
| Method: | Least Squares | F-statistic: | 4474. |
| Date: | Thu, 07 Dec 2023 | Prob (F-statistic): | 0.00 |
| Time: | 21:24:29 | Log-Likelihood: | -3690.7 |
| No. Observations: | 10682 | AIC: | 7389. |
| Df Residuals: | 10678 | BIC: | 7419. |
| Df Model: | 3 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 8.3698 | 0.007 | 1244.033 | 0.000 | 8.357 | 8.383 |
| Duration_In_Minutes | 0.0006 | 1.5e-05 | 42.760 | 0.000 | 0.001 | 0.001 |
| Total_Stops | 0.6177 | 0.009 | 69.778 | 0.000 | 0.600 | 0.635 |
| Duration_In_Minutes:Total_Stops | -0.0004 | 9.83e-06 | -39.232 | 0.000 | -0.000 | -0.000 |
| Omnibus: | 484.107 | Durbin-Watson: | 1.986 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 879.190 |
| Skew: | 0.358 | Prob(JB): | 1.22e-191 |
| Kurtosis: | 4.210 | Cond. No. | 4.30e+03 |
# Looking at the R^2,it increased by approx. 20% when we looked at the interaction between duration and number of total stops.
# p value is significant
# Mann-Whitney U test.
# H0: median of price = median of total stops (i.e. no relationship)
# H1: median of price != median of total stops (i.e. relationship)
# NOTE(review): this compares the distribution of Log_price against the
# distribution of Total_Stops — two different quantities, so a significant
# result mostly reflects their different scales. Comparing price *between
# groups* defined by stop count would match the stated hypothesis — confirm intent.
# complete mann-whitney test (requires `stats` = scipy.stats from an earlier cell)
res = stats.mannwhitneyu(df['Log_price'], df['Total_Stops'], alternative='two-sided')
print(res.statistic)
print(res.pvalue)
114105124.0 0.0
p value is less than 0.05 so we can reject the null hypothesis number of total stops have an impact on the price of flights
# H0: airline doens't impact price of flights (i.e. no relationship)
# H1: airline impacts price of flights (i.e. relationship)
df.head(5)
| Airline | Journey_month | Journey_year | Source | Destination | Route | Dep_Time | Arrival_Time | Duration_Hours | Duration_Minutes | Duration_In_Minutes | Total_Stops | Price_rupee | Log_price | Class | Additional_Info | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2.0 | 50.0 | 170.0 | 0.0 | 3897 | 8.267962 | Economy | No info |
| 1 | Air India | 5 | 2019 | Kolkata | Banglore | CCU → IXR → BBI → BLR | 05:50 | 13:15 | 7.0 | 25.0 | 445.0 | 2.0 | 7662 | 8.944028 | Economy | No info |
| 2 | Jet Airways | 6 | 2019 | Delhi | Cochin | DEL → LKO → BOM → COK | 09:25 | 04:25 | 19.0 | 0.0 | 1140.0 | 2.0 | 13882 | 9.538348 | Business | No info |
| 3 | IndiGo | 5 | 2019 | Kolkata | Banglore | CCU → NAG → BLR | 18:05 | 23:30 | 5.0 | 25.0 | 325.0 | 1.0 | 6218 | 8.735204 | Economy | No info |
| 4 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4.0 | 45.0 | 285.0 | 1.0 | 13302 | 9.495670 | Business | No info |
# IndiGo flights on the Banglore -> New Delhi route only
indigo_mask = (df.Airline == 'IndiGo') & (df.Source == 'Banglore') & (df.Destination == 'New Delhi')
df_IndiGO = df[indigo_mask]
The above code extracts the rows for the IndiGo airline where the source is Banglore and the destination is New Delhi, stored in a dataframe called df_IndiGO.
df_IndiGO
| Airline | Journey_month | Journey_year | Source | Destination | Route | Dep_Time | Arrival_Time | Duration_Hours | Duration_Minutes | Duration_In_Minutes | Total_Stops | Price | Class | Additional_Info | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2.0 | 50.0 | 170.0 | 0.0 | 3897 | Economy | No info |
| 4 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4.0 | 45.0 | 285.0 | 1.0 | 13302 | Economy | No info |
| 52 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2.0 | 50.0 | 170.0 | 0.0 | 4377 | Economy | No info |
| 157 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 18:55 | 21:45 | 2.0 | 50.0 | 170.0 | 0.0 | 8855 | Economy | No info |
| 161 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 00:40 | 03:25 | 2.0 | 45.0 | 165.0 | 0.0 | 4777 | Economy | No info |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10274 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 08:30 | 11:30 | 3.0 | 0.0 | 180.0 | 0.0 | 6860 | Economy | No info |
| 10315 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → HYD → DEL | 12:10 | 17:05 | 4.0 | 55.0 | 295.0 | 1.0 | 4410 | Economy | No info |
| 10318 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 23:30 | 02:20 | 2.0 | 50.0 | 170.0 | 0.0 | 5694 | Economy | No info |
| 10351 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 13:00 | 15:50 | 2.0 | 50.0 | 170.0 | 0.0 | 6144 | Economy | No info |
| 10604 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 20:00 | 22:55 | 2.0 | 55.0 | 175.0 | 0.0 | 6144 | Economy | No info |
157 rows × 15 columns
# Jet Airways flights on the same Banglore -> New Delhi route, for comparison
jet_mask = (df.Airline == 'Jet Airways') & (df.Source == 'Banglore') & (df.Destination == 'New Delhi')
df_jet_airways = df[jet_mask]
df_jet_airways
| Airline | Journey_month | Journey_year | Source | Destination | Route | Dep_Time | Arrival_Time | Duration_Hours | Duration_Minutes | Duration_In_Minutes | Total_Stops | Price | Class | Additional_Info | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 18:55 | 10:25 | 15.0 | 30.0 | 930.0 | 1.0 | 11087 | Economy | In-flight meal not included |
| 7 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 08:00 | 05:05 | 21.0 | 5.0 | 1265.0 | 1.0 | 22270 | Business | No info |
| 8 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 08:55 | 10:25 | 25.0 | 30.0 | 1530.0 | 1.0 | 11087 | Economy | In-flight meal not included |
| 38 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 07:00 | 13:15 | 6.0 | 15.0 | 375.0 | 1.0 | 19225 | Economy | No info |
| 71 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 05:45 | 08:15 | 26.0 | 30.0 | 1590.0 | 1.0 | 9134 | Economy | In-flight meal not included |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10579 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 14:05 | 09:30 | 19.0 | 25.0 | 1165.0 | 1.0 | 17261 | Economy | No info |
| 10611 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 08:00 | 21:20 | 13.0 | 20.0 | 800.0 | 1.0 | 11087 | Economy | In-flight meal not included |
| 10637 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 21:25 | 11:25 | 14.0 | 0.0 | 840.0 | 1.0 | 7832 | Economy | In-flight meal not included |
| 10666 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 22:55 | 20:20 | 21.0 | 25.0 | 1285.0 | 1.0 | 11087 | Economy | In-flight meal not included |
| 10673 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 20:35 | 21:20 | 24.0 | 45.0 | 1485.0 | 1.0 | 11087 | Economy | In-flight meal not included |
418 rows × 15 columns
# Two airlines sharing the same source and destination were extracted above;
# stack them into a single frame so they can be compared directly.
joined = pd.concat([df_IndiGO, df_jet_airways])
joined
| Airline | Journey_month | Journey_year | Source | Destination | Route | Dep_Time | Arrival_Time | Duration_Hours | Duration_Minutes | Duration_In_Minutes | Total_Stops | Price_rupee | Log_price | Class | Additional_Info | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2.0 | 50.0 | 170.0 | 0.0 | 3897 | 8.267962 | Economy | No info |
| 4 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → NAG → DEL | 16:50 | 21:35 | 4.0 | 45.0 | 285.0 | 1.0 | 13302 | 9.495670 | Business | No info |
| 52 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 22:20 | 01:10 | 2.0 | 50.0 | 170.0 | 0.0 | 4377 | 8.384119 | Economy | No info |
| 157 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 18:55 | 21:45 | 2.0 | 50.0 | 170.0 | 0.0 | 8855 | 9.088738 | Economy | No info |
| 161 | IndiGo | 3 | 2019 | Banglore | New Delhi | BLR → DEL | 00:40 | 03:25 | 2.0 | 45.0 | 165.0 | 0.0 | 4777 | 8.471568 | Economy | No info |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10579 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 14:05 | 09:30 | 19.0 | 25.0 | 1165.0 | 1.0 | 17261 | 9.756205 | Business | No info |
| 10611 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 08:00 | 21:20 | 13.0 | 20.0 | 800.0 | 1.0 | 11087 | 9.313529 | Economy | In-flight meal not included |
| 10637 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 21:25 | 11:25 | 14.0 | 0.0 | 840.0 | 1.0 | 7832 | 8.965973 | Economy | In-flight meal not included |
| 10666 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 22:55 | 20:20 | 21.0 | 25.0 | 1285.0 | 1.0 | 11087 | 9.313529 | Economy | In-flight meal not included |
| 10673 | Jet Airways | 3 | 2019 | Banglore | New Delhi | BLR → BOM → DEL | 20:35 | 21:20 | 24.0 | 45.0 | 1485.0 | 1.0 | 11087 | 9.313529 | Economy | In-flight meal not included |
575 rows × 16 columns
# Log prices of the IndiGo rows within the combined frame
joined.loc[joined['Airline'] == 'IndiGo', 'Log_price']
0 8.267962
4 9.495670
52 8.384119
157 9.088738
161 8.471568
...
10274 8.833463
10315 8.391630
10318 8.647168
10351 8.723231
10604 8.723231
Name: Log_price, Length: 157, dtype: float64
# Log prices of the Jet Airways rows within the combined frame
joined.loc[joined['Airline'] == 'Jet Airways', 'Log_price']
6 9.313529
7 10.010996
8 9.313529
38 9.863967
71 9.119759
...
10579 9.756205
10611 9.313529
10637 8.965973
10666 9.313529
10673 9.313529
Name: Log_price, Length: 418, dtype: float64
from scipy.stats import ttest_ind
# Split the route-matched sample by airline
Indigo_price=joined[joined["Airline"] == 'IndiGo']['Log_price']
jet_price=joined[joined['Airline'] == 'Jet Airways']['Log_price']
# perform two-sample independent t test on the two airlines' log prices
res=ttest_ind(Indigo_price, jet_price)
res
# before we interpret the t test , we must test the assumptions
Ttest_indResult(statistic=-18.340755646921174, pvalue=1.8519995705058648e-59)
# Check Normality Assumption for the airline t-test
group1 = Indigo_price
group2 = jet_price
plt.figure(figsize=(12, 6))
# Left panel: overlaid histograms of the two airlines' prices
plt.subplot(1, 2, 1)
sns.histplot(group1, kde=True, color='blue', label='Indigo Price')
sns.histplot(group2, kde=True, color='orange', label='jet Price')
plt.title('Histograms for Normality Check')
plt.legend()
# Right panel: Q-Q plot for the IndiGo prices (group1)
# NOTE(review): `stats` must be scipy.stats from an earlier cell — confirm.
plt.subplot(1, 2, 2)
stats.probplot(group1, plot=plt, rvalue=True)
plt.title('Q-Q Plot for Indigo Price')
plt.show()
Distribution is symmetrical and normality assumptions are also met therefore we can interpret the p value from the t-test. p-value less than 0.05 so we can reject the null hypothesis and conclude airline and flight prices have a relationship
# Two-way model: class, airline and their interaction as predictors of log price.
# C(...) treats both columns as categorical; '*' adds main effects plus interaction.
modelinteraction= ols('Log_price ~ C(Class)*C(Airline)', data=df).fit()
modelinteraction.summary()
| Dep. Variable: | Log_price | R-squared: | 0.688 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.687 |
| Method: | Least Squares | F-statistic: | 1174. |
| Date: | Thu, 07 Dec 2023 | Prob (F-statistic): | 0.00 |
| Time: | 21:32:27 | Log-Likelihood: | -1822.3 |
| No. Observations: | 10682 | AIC: | 3687. |
| Df Residuals: | 10661 | BIC: | 3839. |
| Df Model: | 20 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 9.5158 | 0.166 | 57.376 | 0.000 | 9.191 | 9.841 |
| C(Class)[T.Economy] | -0.9503 | 0.167 | -5.703 | 0.000 | -1.277 | -0.624 |
| C(Airline)[T.Air India] | 0.0426 | 0.166 | 0.256 | 0.798 | -0.284 | 0.369 |
| C(Airline)[T.GoAir] | 0.2206 | 0.210 | 1.052 | 0.293 | -0.191 | 0.632 |
| C(Airline)[T.IndiGo] | 0.0517 | 0.170 | 0.305 | 0.761 | -0.281 | 0.384 |
| C(Airline)[T.Jet Airways] | 0.0763 | 0.166 | 0.460 | 0.646 | -0.249 | 0.402 |
| C(Airline)[T.Jet Airways Business] | 1.4434 | 0.203 | 7.106 | 0.000 | 1.045 | 1.842 |
| C(Airline)[T.Multiple carriers] | 0.0751 | 0.166 | 0.451 | 0.652 | -0.251 | 0.401 |
| C(Airline)[T.Multiple carriers Premium economy] | 0.0450 | 0.235 | 0.192 | 0.848 | -0.415 | 0.505 |
| C(Airline)[T.SpiceJet] | 0.0885 | 0.219 | 0.403 | 0.687 | -0.342 | 0.519 |
| C(Airline)[T.Trujet] | -0.1185 | 0.144 | -0.824 | 0.410 | -0.400 | 0.163 |
| C(Airline)[T.Vistara] | -0.0291 | 0.171 | -0.170 | 0.865 | -0.364 | 0.306 |
| C(Airline)[T.Vistara Premium economy] | 0.2487 | 0.083 | 2.984 | 0.003 | 0.085 | 0.412 |
| C(Class)[T.Economy]:C(Airline)[T.Air India] | 0.3019 | 0.167 | 1.804 | 0.071 | -0.026 | 0.630 |
| C(Class)[T.Economy]:C(Airline)[T.GoAir] | -0.2150 | 0.211 | -1.017 | 0.309 | -0.629 | 0.199 |
| C(Class)[T.Economy]:C(Airline)[T.IndiGo] | -0.0692 | 0.171 | -0.406 | 0.685 | -0.404 | 0.265 |
| C(Class)[T.Economy]:C(Airline)[T.Jet Airways] | 0.3744 | 0.167 | 2.243 | 0.025 | 0.047 | 0.701 |
| C(Class)[T.Economy]:C(Airline)[T.Jet Airways Business] | -1.669e-16 | 4.74e-17 | -3.520 | 0.000 | -2.6e-16 | -7.39e-17 |
| C(Class)[T.Economy]:C(Airline)[T.Multiple carriers] | 0.3962 | 0.168 | 2.365 | 0.018 | 0.068 | 0.725 |
| C(Class)[T.Economy]:C(Airline)[T.Multiple carriers Premium economy] | 0.6547 | 0.252 | 2.597 | 0.009 | 0.161 | 1.149 |
| C(Class)[T.Economy]:C(Airline)[T.SpiceJet] | -0.3563 | 0.220 | -1.618 | 0.106 | -0.788 | 0.075 |
| C(Class)[T.Economy]:C(Airline)[T.Trujet] | -0.1185 | 0.144 | -0.824 | 0.410 | -0.400 | 0.163 |
| C(Class)[T.Economy]:C(Airline)[T.Vistara] | 0.2933 | 0.172 | 1.703 | 0.089 | -0.044 | 0.631 |
| C(Class)[T.Economy]:C(Airline)[T.Vistara Premium economy] | 0.2487 | 0.083 | 2.984 | 0.003 | 0.085 | 0.412 |
| Omnibus: | 921.129 | Durbin-Watson: | 1.977 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 2265.404 |
| Skew: | -0.516 | Prob(JB): | 0.00 |
| Kurtosis: | 5.006 | Cond. No. | 1.82e+18 |
from statsmodels.graphics.factorplots import interaction_plot

# Interaction plot: mean log price per airline, one line per class.
# Non-parallel lines indicate an interaction between the two factors.
interaction_plot(df['Airline'], df['Class'], df['Log_price'],
                 colors=['orange','blue'], markers=['D','^'])
plt.xticks(rotation=45)
# Fix: the old title ("final weight and diet type") was copy-pasted from an
# unrelated exercise and did not describe this plot.
plt.title('Interaction Plot of Log Price by Airline and Class')
plt.show()
This plots the means of the variables. Looking at the plot, the lines are not parallel and are heading in a direction where they will eventually meet, which indicates that an interaction occurs.
Analysing the model summary report, we can see that the R2 is 69 % when the variables airlines and class arent treated as independent, we can conclude 69% of the variation in the model is explained by these variables
# Check that the class*airline model meets the ANOVA assumptions before the two-way ANOVA.
# Bug fix: this cell referenced `model2interaction`, which is never defined
# anywhere in the notebook; the fitted model above is named `modelinteraction`.
#fitted values
model_fitted_vals = modelinteraction.fittedvalues
#model residuals
model_residuals = modelinteraction.resid
#standardised residuals
model_norm_residuals = modelinteraction.get_influence().resid_studentized_internal
# Residuals vs fitted values: equal-variance check
sns.regplot(x=model_fitted_vals, y=model_residuals,
            ci=False, lowess=True,
            line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.show()
# Q-Q plot of standardised residuals: normality check
stats.probplot(model_norm_residuals, plot=sns.mpl.pyplot)
plt.show()
The assumptions are met; it is just the outliers that make the plots look uneven. We can therefore conduct an ANOVA test.
# Two-way ANOVA on the class*airline model.
# H0: no interaction
# H1: there is interaction
# Bug fix: `model2interaction` was never defined; the fitted model is `modelinteraction`.
anova2way = anova_lm(modelinteraction, typ=2)
anova2way
C:\Users\lujai\anaconda3\lib\site-packages\statsmodels\base\model.py:1871: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 11, but rank is 8
warnings.warn('covariance of constraints does not have full '
C:\Users\lujai\anaconda3\lib\site-packages\statsmodels\base\model.py:1871: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 11, but rank is 10
warnings.warn('covariance of constraints does not have full '
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Class) | 629.801224 | 1.0 | 7632.191674 | 0.000000e+00 |
| C(Airline) | 646.521620 | 11.0 | 712.256031 | 0.000000e+00 |
| C(Class):C(Airline) | 19.553479 | 11.0 | 21.541559 | 2.633468e-40 |
| Residual | 879.735617 | 10661.0 | NaN | NaN |
All p-values are less than 0.05 and are therefore significant, so we reject the null hypothesis and conclude there is an interaction.
# Draw a subsample of 400 rows; replace=True samples with replacement,
# so individual rows can appear more than once.
df1 = df.sample(400, replace=True)
# NOTE(review): a smaller sample gives *less* statistical power, not "more
# accurate p values" as originally claimed; replace=True also duplicates
# rows. Confirm this resampling is intended.
#h0: full model is prefered
#h1: reduced model is prefered
# Since the predictors I'm interested in turned out to be significant,so we're going to test full model with interaction
#Full model with interaction
# NOTE(review): the formula term 'Journey_month *Journey_month:C(Airline)'
# mixes '*' and ':' in an unusual way; 'Journey_month*C(Airline)' may be what
# was intended — confirm before relying on the coefficients.
FULL_MODEL= ols('Log_price ~ C(Class)+C(Airline)+Total_Stops + Duration_In_Minutes+Journey_month *Journey_month:C(Airline)', data=df1).fit()
FULL_MODEL.summary()
| Dep. Variable: | Log_price | R-squared: | 0.801 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.791 |
| Method: | Least Squares | F-statistic: | 84.94 |
| Date: | Sat, 09 Dec 2023 | Prob (F-statistic): | 3.80e-121 |
| Time: | 00:20:37 | Log-Likelihood: | 15.944 |
| No. Observations: | 400 | AIC: | 6.112 |
| Df Residuals: | 381 | BIC: | 81.95 |
| Df Model: | 18 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 8.0630 | 0.769 | 10.490 | 0.000 | 6.552 | 9.574 |
| C(Class)[T.Economy] | -0.4570 | 0.032 | -14.180 | 0.000 | -0.520 | -0.394 |
| C(Airline)[T.Air India] | 1.0448 | 0.776 | 1.347 | 0.179 | -0.480 | 2.570 |
| C(Airline)[T.GoAir] | 1.8786 | 0.868 | 2.164 | 0.031 | 0.172 | 3.586 |
| C(Airline)[T.IndiGo] | 1.0130 | 0.775 | 1.308 | 0.192 | -0.510 | 2.536 |
| C(Airline)[T.Jet Airways] | 1.1278 | 0.772 | 1.461 | 0.145 | -0.390 | 2.646 |
| C(Airline)[T.Multiple carriers] | 1.2060 | 0.777 | 1.552 | 0.121 | -0.322 | 2.734 |
| C(Airline)[T.SpiceJet] | 1.2882 | 0.804 | 1.602 | 0.110 | -0.292 | 2.869 |
| C(Airline)[T.Vistara] | 1.1001 | 0.816 | 1.348 | 0.179 | -0.505 | 2.705 |
| Total_Stops | 0.2646 | 0.028 | 9.433 | 0.000 | 0.209 | 0.320 |
| Duration_In_Minutes | 7.828e-05 | 3.57e-05 | 2.192 | 0.029 | 8.07e-06 | 0.000 |
| Journey_month | 0.1413 | 0.136 | 1.039 | 0.300 | -0.126 | 0.409 |
| Journey_month:C(Airline)[T.Air India] | -0.1533 | 0.139 | -1.107 | 0.269 | -0.426 | 0.119 |
| Journey_month:C(Airline)[T.GoAir] | -0.3521 | 0.163 | -2.165 | 0.031 | -0.672 | -0.032 |
| Journey_month:C(Airline)[T.IndiGo] | -0.1893 | 0.138 | -1.372 | 0.171 | -0.461 | 0.082 |
| Journey_month:C(Airline)[T.Jet Airways] | -0.1396 | 0.137 | -1.018 | 0.309 | -0.409 | 0.130 |
| Journey_month:C(Airline)[T.Multiple carriers] | -0.1503 | 0.138 | -1.086 | 0.278 | -0.422 | 0.122 |
| Journey_month:C(Airline)[T.SpiceJet] | -0.2931 | 0.145 | -2.023 | 0.044 | -0.578 | -0.008 |
| Journey_month:C(Airline)[T.Vistara] | -0.1511 | 0.153 | -0.987 | 0.324 | -0.452 | 0.150 |
| Omnibus: | 39.001 | Durbin-Watson: | 2.091 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 144.749 |
| Skew: | -0.326 | Prob(JB): | 3.70e-32 |
| Kurtosis: | 5.874 | Cond. No. | 1.53e+05 |
# Assumption-check inputs for FULL_MODEL
#fitted values
model_fitted_vals = FULL_MODEL.fittedvalues
#model residuals
model_residuals = FULL_MODEL.resid
#standardised residuals (internally studentized)
model_norm_residuals = FULL_MODEL.get_influence().resid_studentized_internal
C:\Users\lujai\anaconda3\lib\site-packages\statsmodels\stats\outliers_influence.py:696: RuntimeWarning: invalid value encountered in sqrt return self.resid / sigma / np.sqrt(1 - hii)
# Residuals vs fitted values: equal-variance check for FULL_MODEL
sns.regplot(x=model_fitted_vals,y=model_residuals,
            ci=False,lowess=True,
            line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.show()
# Q-Q plot of standardised residuals: normality check
stats.probplot(model_norm_residuals, plot=sns.mpl.pyplot)
plt.show()
# it meets the normality assumptions
def graph(formula, x_range, label=None):
    """
    Helper function for plotting cook's distance lines.

    formula: callable mapping leverage values to the boundary curve;
    x_range: array of leverage values to evaluate; label: legend label.
    """
    x = x_range
    y = formula(x)
    plt.plot(x, y, label=label, lw=1, ls='--', color='red')

# leverage, from statsmodels internals
model_leverage = FULL_MODEL.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_cooks = FULL_MODEL.get_influence().cooks_distance[0]
# Residuals-vs-leverage plot with a lowess trend line
plot_cooks = plt.figure();
plt.scatter(model_leverage, model_norm_residuals, alpha=0.5);
sns.regplot(x=model_leverage, y=model_norm_residuals,
            scatter=False,
            ci=False,
            lowess=True,
            line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8});
plot_cooks.axes[0].set_xlim(0, max(model_leverage)+0.01)
plot_cooks.axes[0].set_ylim(-4, 4)
plot_cooks.axes[0].set_title('Residuals vs Leverage')
plot_cooks.axes[0].set_xlabel('Leverage')
plot_cooks.axes[0].set_ylabel('Standardized Residuals');
# annotate the three observations with the largest Cook's distance
leverage_top_3 = np.flip(np.argsort(model_cooks), 0)[:3]
for i in leverage_top_3:
    plot_cooks.axes[0].annotate(i,
                                xy=(model_leverage[i],
                                    model_norm_residuals[i]));
p = len(FULL_MODEL.params)
# reference curves at Cook's distance = 0.5 and 1
graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
      np.linspace(0.001, max(model_leverage), 50),
      'Cook\'s distance') # 0.5 line
graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
      np.linspace(0.001, max(model_leverage), 50)) # 1 line
plot_cooks.legend(loc='upper right');
plt.show()
C:\Users\lujai\AppData\Local\Temp\ipykernel_10760\2298950797.py:31: RuntimeWarning: invalid value encountered in sqrt graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x), C:\Users\lujai\AppData\Local\Temp\ipykernel_10760\2298950797.py:34: RuntimeWarning: invalid value encountered in sqrt graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
Assumptions of equal variance and normality have to be met before we can interpret the model summary. From the plots we can see that both assumptions are met; however, it is important to note that the outliers present make the variance look unequal. We can now interpret the model summary: 80% of the variation is explained by the model, and most of the p-values are significant except those for certain airlines.
Interpreting the model summary:
EQUATION : log_price = 8.0630 -0.4570class_economy + 1.0448airline_airindia+1.8786Airline_GoAir+1.0130Airline_IndiGo+1.1278Airline_JetAirways+1.2060Airline_Multiplecarrier+1.2882Airline_spiceJet+1.1001Airline_vistara+0.2646Total_stops+7.828e- 05duration_in_min+0.1413joruneymonth-0.1533journey_month_airline_airindia-0.3521journey_month:airline_goAir-0.189journey_month:airline_indiGo-0.136journey_month_airlinejetariways-0.1503journey_month:airline_multiplecarriers-0.2931journey_month:airline_spicejet-0.1511journey_month:airline_vistra
The p value of class_economy is less than 0.05 so therefore it suggests that class is significant in predicting the price of flights
Only the p value of airline_GoAir is less than 0.05 , the rest are higher , therefore Airline arent significant in predicting price
The p value of totalstops is less than 0.05 , The total number of stops is statistically significant.
The p value of duration is less than 0.05 , The duration is statistically significant.
The p value of journey month is greater than 0.05 therefore the journey of the month is not statistically significant
The p value of the interaction betwene joureny month and airline vary ,some are significant and some are not.
R^2 is 80% which suggests that 80% of the variance explained by the model AIC: 6.112 BIC:81.95
# Fresh subsample of 400 rows (with replacement) for the reduced model
df1 = df.sample(400, replace=True)
# reduced model: I've removed interaction and journey month as its not significant
# NOTE(review): re-sampling df1 here means FULL_MODEL and RED_MODEL are fitted
# on *different* data, so their AIC/BIC values are not directly comparable —
# reuse the same df1 if a formal model comparison is intended.
RED_MODEL= ols('Log_price ~ C(Class)+C(Airline)+Total_Stops + Duration_In_Minutes', data=df1).fit()
RED_MODEL.summary()
| Dep. Variable: | Log_price | R-squared: | 0.790 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.784 |
| Method: | Least Squares | F-statistic: | 146.1 |
| Date: | Sat, 09 Dec 2023 | Prob (F-statistic): | 4.68e-125 |
| Time: | 00:51:34 | Log-Likelihood: | 5.4156 |
| No. Observations: | 400 | AIC: | 11.17 |
| Df Residuals: | 389 | BIC: | 55.07 |
| Df Model: | 10 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 8.8658 | 0.078 | 114.192 | 0.000 | 8.713 | 9.018 |
| C(Class)[T.Economy] | -0.4628 | 0.032 | -14.394 | 0.000 | -0.526 | -0.400 |
| C(Airline)[T.Air India] | 0.1995 | 0.077 | 2.593 | 0.010 | 0.048 | 0.351 |
| C(Airline)[T.GoAir] | 0.1561 | 0.127 | 1.224 | 0.222 | -0.095 | 0.407 |
| C(Airline)[T.IndiGo] | -0.0014 | 0.072 | -0.019 | 0.985 | -0.143 | 0.140 |
| C(Airline)[T.Jet Airways] | 0.3399 | 0.073 | 4.680 | 0.000 | 0.197 | 0.483 |
| C(Airline)[T.Multiple carriers] | 0.3693 | 0.076 | 4.859 | 0.000 | 0.220 | 0.519 |
| C(Airline)[T.SpiceJet] | -0.2230 | 0.084 | -2.655 | 0.008 | -0.388 | -0.058 |
| C(Airline)[T.Vistara] | 0.2641 | 0.093 | 2.825 | 0.005 | 0.080 | 0.448 |
| Total_Stops | 0.2551 | 0.028 | 9.030 | 0.000 | 0.200 | 0.311 |
| Duration_In_Minutes | 8.459e-05 | 3.61e-05 | 2.346 | 0.019 | 1.37e-05 | 0.000 |
| Omnibus: | 37.268 | Durbin-Watson: | 2.135 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 121.645 |
| Skew: | -0.354 | Prob(JB): | 3.85e-27 |
| Kurtosis: | 5.607 | Cond. No. | 1.36e+04 |
# Diagnostic quantities for the reduced model.
model_fitted_vals_2 = RED_MODEL.fittedvalues  # predicted Log_price
model_residuals_2 = RED_MODEL.resid           # raw residuals
model_norm_residuals_2 = RED_MODEL.get_influence().resid_studentized_internal  # standardised residuals

# Residuals-vs-fitted plot: checks the equal-variance assumption;
# the lowess line should stay roughly flat around zero.
ax = sns.regplot(
    x=model_fitted_vals_2,
    y=model_residuals_2,
    ci=False,
    lowess=True,
    line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
)
ax.set_xlabel("Fitted Values")
ax.set_ylabel("Residuals")
plt.show()

# Q-Q plot of the standardised residuals: checks the normality assumption.
stats.probplot(model_norm_residuals_2, plot=sns.mpl.pyplot)
plt.show()
# it meets the normality assumptions
def graph(formula, x_range, label=None):
    """Draw a dashed red reference curve (used for Cook's distance contours).

    Parameters
    ----------
    formula : callable
        Function mapping an array of x values to y values.
    x_range : array-like
        The x positions at which to evaluate ``formula``.
    label : str, optional
        Legend label for the curve.
    """
    xs = x_range
    plt.plot(xs, formula(xs), label=label, lw=1, ls='--', color='red')
# Leverage (hat-matrix diagonal) for each observation in the reduced model.
model_leverage = RED_MODEL.get_influence().hat_matrix_diag
# cook's distance, from statsmodels internals
model_cooks = RED_MODEL.get_influence().cooks_distance[0]
# Residuals-vs-leverage plot: flags influential observations
# (high leverage combined with a large standardised residual).
plot_cooks = plt.figure();
plt.scatter(model_leverage, model_norm_residuals_2, alpha=0.5);
sns.regplot(x=model_leverage, y=model_norm_residuals_2,
            scatter=False,
            ci=False,
            lowess=True,
            line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8});
plot_cooks.axes[0].set_xlim(0, max(model_leverage)+0.01)
plot_cooks.axes[0].set_ylim(-4, 4)
plot_cooks.axes[0].set_title('Residuals vs Leverage')
plot_cooks.axes[0].set_xlabel('Leverage')
plot_cooks.axes[0].set_ylabel('Standardized Residuals');
# Annotate the three observations with the largest Cook's distance
# (argsort ascending, then flipped, so the first three are the largest).
leverage_top_3 = np.flip(np.argsort(model_cooks), 0)[:3]
for i in leverage_top_3:
    plot_cooks.axes[0].annotate(i,
                                xy=(model_leverage[i],
                                    model_norm_residuals_2[i]));
# Overlay Cook's distance contours at D = 0.5 and D = 1; points lying
# beyond these curves are candidates for further inspection/removal.
p = len(RED_MODEL.params)
graph(lambda x: np.sqrt((0.5 * p * (1 - x)) / x),
      np.linspace(0.001, max(model_leverage), 50),
      'Cook\'s distance') # 0.5 line
graph(lambda x: np.sqrt((1 * p * (1 - x)) / x),
      np.linspace(0.001, max(model_leverage), 50)) # 1 line
plot_cooks.legend(loc='upper right');
plt.show()
Assumptions of equal variance and normality have to be met in the reduced model before we can interpret the model summary. From the plots we can see that both assumptions are met; however, it's important to note that the outliers present make the variance look unequal. We can now interpret the model summary:
EQUATION: Log_price = 8.8658 − 0.4628·class_economy + 0.1995·airline_airindia + 0.1561·airline_goair − 0.0014·airline_indigo + 0.3399·airline_jetairways + 0.3693·airline_multiplecarriers − 0.2230·airline_spicejet + 0.2641·airline_vistara + 0.2551·total_stops + 8.459e-05·duration_in_minutes
Intercept 8.8658: The predicted Log_price is 8.8658 when all predictor variables are zero. class_economy-0.4628: the Log_price in the Economy class is predicted to drop by 0.4628 units. Airline_AirIndia0.1995: The Log_price is predicted to rise by 0.1995 units in relation to the reference airline when the airline is Air India.
Airline_GoAir0.1561: The Log_price is predicted to rise by 0.1561 units in comparison to the reference airline when the airline is GoAir.
Airline_IndiGo-0.0014 is modest and lacks statistical significance, indicating that the impact of IndiGo might not be substantial.
Airline_Jet Airways0.3399: The Log_price is expected to rise by 0.3399 units in comparison to the reference airline when the airline is Jet Airways.
Airline_Multiple carriers 0.3693: The Log_price is predicted to rise by 0.3693 units in comparison to the reference airline when the airline is Multiple carriers.
Airline_SpiceJet −0.2230: The Log_price is predicted to drop by 0.2230 units in comparison to the reference airline when the airline is SpiceJet.
Airline_Vistara 0.2641: The Log_price is expected to rise by 0.2641 units in comparison to the reference airline when the airline is Vistara.
Total_Stops 0.2551: The Log_price is expected to rise by 0.2551 units for every extra stop.
Duration_In_Minutes 8.459e-05: The Log_price is predicted to rise by 8.459e-05 units for every extra minute of duration.
79% of the variation is explained by the model. All of the p-values are significant except those for certain airlines. The log-likelihood decreased in the reduced model, and both AIC and BIC increased in the reduced model.
We can see that the BIC and AIC increased in the reduced model; R² also decreased slightly in the reduced model (0.790 vs ~0.80 in the full model).
p-value of the full model: 3.80e-121; p-value of the reduced model: 4.68e-125. Since the models met the required assumptions, we can conduct an ANOVA (partial F-) test comparing the nested models.
#H0: the reduced model is adequate (the dropped terms have zero coefficients).
#H1: the full model is preferred (the dropped terms add explanatory power).
# Partial F-test comparing the nested models; anova_lm expects the
# restricted (reduced) model first, then the full model.
anovaResults = anova_lm(RED_MODEL,FULL_MODEL)
anovaResults
| df_resid | ssr | df_diff | ss_diff | F | Pr(>F) | |
|---|---|---|---|---|---|---|
| 0 | 389.0 | 22.794279 | 0.0 | NaN | NaN | NaN |
| 1 | 381.0 | 21.625372 | 8.0 | 1.168907 | 2.574254 | 0.009557 |
Looking at the ANOVA test, the p-value (0.0096) is less than 0.05, so we reject the null hypothesis: the terms present only in the full model are jointly significant, which means the full model is preferred over the reduced one.
# Residual sum of squares (RSS) for each model.
# FIX: the two assignments were previously swapped -- rss_reduced was
# computed from FULL_MODEL and rss_full from RED_MODEL, so the printed
# values carried the wrong labels.
rss_reduced = np.sum(RED_MODEL.resid ** 2)
rss_full = np.sum(FULL_MODEL.resid ** 2)
print(rss_reduced)
print(rss_full)
# The reduced model has the HIGHER RSS (dropping predictors can never
# lower the in-sample RSS); the full model fits the sample more closely.
21.625371599731196 22.79427880897562
# conclude: consistent with the significant F-test and the lower RSS/AIC/BIC, the full model is preferred; the reduced model would only be chosen if parsimony were the priority.